# Load the Biden survey data from CSV (columns used below: biden, age,
# female, educ, dem, rep).
targetFile <- "data/biden.csv"
data <- read.csv(targetFile)
# Mean squared error of `model` evaluated on `data`.
#
# Fix: the original reached into modelr's internals with
# `modelr:::residuals(model, data)`. `:::` accesses unexported API that may
# change without notice; the same quantity is computed here with public
# functions only — observed response minus model predictions.
#
# @param model A fitted model (e.g. from lm()/glm()/gam()).
# @param data  A data frame, or a modelr `resample` object (coerced with
#   as.data.frame(), which resample objects support).
# @return Scalar mean of squared prediction errors, NAs removed.
mse <- function(model, data) {
df <- as.data.frame(data)
response <- all.vars(formula(model))[1]
err <- df[[response]] - stats::predict(model, newdata = df)
mean(err^2, na.rm = TRUE)
}
# Baseline OLS model of the Biden score on demographics and party, plus its
# in-sample (whole-data) MSE.
lin_Biden <- lm(biden ~ age + female + educ + dem + rep, data = data)
lin_mse <- mse(lin_Biden, data)
summary(lin_Biden)
##
## Call:
## lm(formula = biden ~ age + female + educ + dem + rep, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.55 -11.29 1.02 12.78 53.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.8113 3.1244 18.82 < 2e-16 ***
## age 0.0483 0.0282 1.71 0.088 .
## female 4.1032 0.9482 4.33 1.6e-05 ***
## educ -0.3453 0.1948 -1.77 0.076 .
## dem 15.4243 1.0680 14.44 < 2e-16 ***
## rep -15.8495 1.3114 -12.09 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.9 on 1801 degrees of freedom
## Multiple R-squared: 0.282, Adjusted R-squared: 0.28
## F-statistic: 141 on 5 and 1801 DF, p-value: <2e-16
The MSE for the whole set is 395.27.
# Single 70/30 train/test split: refit on the training portion, then score
# on the held-out test portion.
data_split <- resample_partition(data, c(test = 0.3, train = 0.7))
lin_Biden_train <- lm(biden ~ age + female + educ + dem + rep, data = data_split$train)
lin_mse_val <- mse(lin_Biden_train, data_split$test)
The MSE of the model trained on the training set, evaluated on the test set, is 401.628. This is higher than when fitted against the whole set, suggesting the whole-set fit led to overfitting.
# Fit the Biden OLS model on an i% test / (100 - i)% train split of the
# global `data` and return the test-set MSE via mse().
mseCal <- function(i) {
# 100 different splits
# NOTE(review): i == 30 is nudged to 30.1 — presumably a workaround for a
# failing/degenerate resample_partition() split at exactly 30%; confirm.
if (i == 30) {
i <- 30.1
}
r <- i/100
split <- resample_partition(data, c(test = r, train = 1 - r))
lin_model <- lm(biden ~ age + female + educ + dem + rep, data = split$train)
mse(lin_model, split$test)
}
# Sweep test-set percentages 1..100 through mseCal and plot MSE against the
# percentage held out for testing.
set.seed(1234)
df <- data.frame(index = 1:100)
df$mse <- unlist(lapply(df$index, mseCal))
ggplot(data = df, aes(x = index, mse)) + geom_smooth() + geom_point() + labs(title = "MSE vs Percentage of elements in the testing set",
x = "Percentage of elements in the testing set", y = "MSE")
We can see from the plot that most partitions result in an MSE of about 400, but both high and low test-set percentages result in higher MSE, due to underfitting and overfitting respectively. Although \(100\%\) inclusion in the training set results in a lower MSE, as we saw in section 1.
# Leave-one-out CV: k = n folds, one refit per held-out observation, then
# average the per-fold MSEs.
loocv_data <- crossv_kfold(data, k = nrow(data))
loocv_models <- map(loocv_data$train, ~lm(biden ~ age + female + educ + dem +
rep, data = .))
loocv_mse <- map2_dbl(loocv_models, loocv_data$test, mse)
loocv_mean_mse <- mean(loocv_mse)
The mean MSE under leave-one-out is 397.956. This is about what we saw in the center of the 100 splits and seems to be the lowest we can get without over fitting.
# One run of 10-fold cross-validation for the Biden model. `i` is only an
# iteration index (ignored); randomness comes from crossv_kfold(). Returns
# the mean MSE across the 10 folds.
#
# Fix: the original ended with `tenFold_mean_mse <- mean(...)`, which made
# the return value invisible when the function was called interactively;
# returning the expression directly fixes that without changing the value.
mseFoldCal <- function(i) {
tenFold_data <- crossv_kfold(data, k = 10)
tenFold_models <- map(tenFold_data$train, ~lm(biden ~ age + female + educ +
dem + rep, data = .))
tenFold_mse <- map2_dbl(tenFold_models, tenFold_data$test, mse)
mean(tenFold_mse)
}
# Repeat 10-fold CV 100 times to see the spread of the CV estimate.
set.seed(1234)
ten_fold_df <- data.frame(index = 1:100)
ten_fold_df$mse <- unlist(lapply(ten_fold_df$index, mseFoldCal))
The mean MSE under 10-fold CV is 397.884. This is about what we saw in the center of the 100 splits, and very similar to the mean of the LOOCV, it still seems to be the lowest we can get without over fitting.
# Histogram of the 100 repeated 10-fold CV MSE estimates.
qplot(mse, data = ten_fold_df, geom = "histogram", main = "Histogram of model MSE under 10-fold CV",
binwidth = 0.1, xlab = "MSE", ylab = "Count")
The histogram of 10-fold MSEs shows that the MSE is always very close to 398 under different iterations of 10-fold CV. This is very similar to that of the LOOCV, which was likewise stable across most cuts.
# Bootstrap (1000 resamples): refit the model on each resample and keep a
# tidy coefficient table per fit.
biden_boot <- data %>% modelr::bootstrap(1000) %>% mutate(model = map(strap,
~lm(biden ~ age + female + educ + dem + rep, data = .)), coef = map(model,
tidy))
# Bootstrap point estimate (mean) and standard error (sd) per coefficient.
biden_boot %>% unnest(coef) %>% group_by(term) %>% summarize(est.boot = mean(estimate),
se.boot = sd(estimate, na.rm = TRUE))
## # A tibble: 6 × 3
## term est.boot se.boot
## <chr> <dbl> <dbl>
## 1 (Intercept) 58.937 2.9781
## 2 age 0.048 0.0288
## 3 dem 15.438 1.1129
## 4 educ -0.353 0.1912
## 5 female 4.103 0.9544
## 6 rep -15.868 1.4452
# Re-print the original (non-bootstrap) fit for comparison with the
# bootstrap estimates above.
summary(lin_Biden)
##
## Call:
## lm(formula = biden ~ age + female + educ + dem + rep, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.55 -11.29 1.02 12.78 53.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.8113 3.1244 18.82 < 2e-16 ***
## age 0.0483 0.0282 1.71 0.088 .
## female 4.1032 0.9482 4.33 1.6e-05 ***
## educ -0.3453 0.1948 -1.77 0.076 .
## dem 15.4243 1.0680 14.44 < 2e-16 ***
## rep -15.8495 1.3114 -12.09 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.9 on 1801 degrees of freedom
## Multiple R-squared: 0.282, Adjusted R-squared: 0.28
## F-statistic: 141 on 5 and 1801 DF, p-value: <2e-16
These are very similar estimated values to those provided by the initial fit (e.g. age differs by only \(2.5\%\)), but the standard errors for the bootstrap are larger.
# Load the college data and fit out-of-state tuition on every predictor to
# screen for promising variables.
targetFile <- "data/college.csv"
data <- read.csv(targetFile)
lin_coll <- lm(Outstate ~ ., data = data)
summary(lin_coll)
##
## Call:
## lm(formula = Outstate ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6783 -1267 -41 1244 9953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.59e+03 7.66e+02 -2.07 0.0386 *
## PrivateYes 2.26e+03 2.48e+02 9.13 < 2e-16 ***
## Apps -3.03e-01 6.73e-02 -4.51 7.6e-06 ***
## Accept 8.12e-01 1.29e-01 6.29 5.5e-10 ***
## Enroll -5.49e-01 3.54e-01 -1.55 0.1213
## Top10perc 2.83e+01 1.10e+01 2.58 0.0100 *
## Top25perc -3.78e+00 8.47e+00 -0.45 0.6558
## F.Undergrad -9.57e-02 6.15e-02 -1.55 0.1204
## P.Undergrad 1.17e-02 6.05e-02 0.19 0.8472
## Room.Board 8.82e-01 8.56e-02 10.30 < 2e-16 ***
## Books -4.59e-01 4.48e-01 -1.03 0.3055
## Personal -2.29e-01 1.18e-01 -1.94 0.0528 .
## PhD 1.12e+01 8.73e+00 1.29 0.1982
## Terminal 2.47e+01 9.54e+00 2.59 0.0099 **
## S.F.Ratio -4.64e+01 2.44e+01 -1.90 0.0575 .
## perc.alumni 4.18e+01 7.56e+00 5.53 4.5e-08 ***
## Expend 1.99e-01 2.27e-02 8.77 < 2e-16 ***
## Grad.Rate 2.40e+01 5.51e+00 4.36 1.5e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1960 on 759 degrees of freedom
## Multiple R-squared: 0.768, Adjusted R-squared: 0.763
## F-statistic: 148 on 17 and 759 DF, p-value: <2e-16
Doing a linear fit across all the variables lets us pick a few variables that are likely to be significant. Let's look at Expend, Room.Board and Apps:
# Simple regression on expenditure per student alone.
lin_Expend <- lm(Outstate ~ Expend, data = data)
summary(lin_Expend)
##
## Call:
## lm(formula = Outstate ~ Expend, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15781 -2089 58 2011 7785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.43e+03 2.25e+02 24.2 <2e-16 ***
## Expend 5.18e-01 2.05e-02 25.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2980 on 775 degrees of freedom
## Multiple R-squared: 0.453, Adjusted R-squared: 0.452
## F-statistic: 641 on 1 and 775 DF, p-value: <2e-16
# Simple regression on room and board costs alone.
lin_Room.Board <- lm(Outstate ~ Room.Board, data = data)
summary(lin_Room.Board)
##
## Call:
## lm(formula = Outstate ~ Room.Board, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8781 -2071 -351 1877 11877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.4453 447.7679 -0.04 0.97
## Room.Board 2.4000 0.0997 24.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3040 on 775 degrees of freedom
## Multiple R-squared: 0.428, Adjusted R-squared: 0.427
## F-statistic: 580 on 1 and 775 DF, p-value: <2e-16
# Simple regression on the number of applications alone.
lin_Apps <- lm(Outstate ~ Apps, data = data)
summary(lin_Apps)
##
## Call:
## lm(formula = Outstate ~ Apps, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8328 -3176 -402 2526 11389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03e+04 1.83e+02 56.3 <2e-16 ***
## Apps 5.21e-02 3.73e-02 1.4 0.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4020 on 775 degrees of freedom
## Multiple R-squared: 0.00252, Adjusted R-squared: 0.00123
## F-statistic: 1.95 on 1 and 775 DF, p-value: 0.162
Looking at the summaries of the models shows that the \(R^2\) values are very low (around \(.4\)) and that the Apps p-value has increased significantly (its \(R^2\) is also low). The best fit, by \(R^2\) is for expend. So lets plot that:
# Residuals vs predicted values for the linear Expend model.
expendDF <- add_predictions(data, lin_Expend)
expendDF <- add_residuals(expendDF, lin_Expend)
ggplot(expendDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() +
labs(title = "Linear model regression for Expend", x = "Predicted expenditure",
y = "Residuals")
The plot shows that the model is mostly accurate for expenditures around \(10\,000\) USD but gets worse as the expenditure increases. This is likely due to a non-linear term becoming dominant. So let's try a polynomial fit; to find the polynomial degree we will use 10-fold CV.
# 10-fold CV over polynomial degree for Outstate ~ poly(Expend, d).
set.seed(1234)
tenFold_data <- crossv_kfold(data, k = 10)
# Mean 10-fold CV MSE for a degree-`d` polynomial in Expend.
# Fix: return the mean directly instead of through a trailing assignment,
# which made the result invisible when the function was called directly.
polyMSE <- function(d) {
tenFold_models <- map(tenFold_data$train, ~lm(Outstate ~ poly(Expend, d),
data = .))
tenFold_mse <- map2_dbl(tenFold_models, tenFold_data$test, mse)
mean(tenFold_mse)
}
tenFoldDF <- data.frame(index = 1:10)
tenFoldDF$mse <- unlist(lapply(1:10, polyMSE))
# Log-scale MSE vs degree, to pick the degree with the lowest CV error.
ggplot(tenFoldDF, aes(index, mse)) + geom_line() + geom_point() + scale_y_log10() +
labs(title = "MSE vs polynomial fit degree for Expend", x = "Degree", y = "MSE")
We can see that the lowest MSE is obtained for a polynomial of degree 3. So let's see what that model leads to.
# Fit the degree-3 polynomial in Expend chosen by 10-fold CV.
poly3_Expend <- lm(Outstate ~ poly(Expend, 3), data = data)
summary(poly3_Expend)
##
## Call:
## lm(formula = Outstate ~ poly(Expend, 3), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11428 -1513 200 1722 5932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10440.7 91.1 114.59 < 2e-16 ***
## poly(Expend, 3)1 75397.1 2539.8 29.69 < 2e-16 ***
## poly(Expend, 3)2 -41623.0 2539.8 -16.39 < 2e-16 ***
## poly(Expend, 3)3 12483.0 2539.8 4.91 1.1e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2540 on 773 degrees of freedom
## Multiple R-squared: 0.603, Adjusted R-squared: 0.601
## F-statistic: 391 on 3 and 773 DF, p-value: <2e-16
An \(R^2\) higher than \(.5\) that’s a much better fit. Lets plot it:
# Residuals vs predicted values for the cubic polynomial Expend model.
expendDF <- add_predictions(data, poly3_Expend)
expendDF <- add_residuals(expendDF, poly3_Expend)
ggplot(expendDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() +
labs(title = "3rd order polynomial model regression for Expend", x = "Predicted expenditure",
y = "Residuals")
This model has none of the edge effects of the linear fit and is a much better fit for the data all around. Since that went well, let's try a nonlinear model for the worst fit, Apps, but this time a spline. First we will find the optimal number of knots with 3rd order piecewise polynomials:
# 10-fold CV over spline flexibility for Outstate ~ bs(Apps, df = n).
set.seed(1234)
tenFold_data <- crossv_kfold(data, k = 10)
# Mean 10-fold CV MSE for a cubic B-spline basis in Apps with `n` degrees of
# freedom. Fixes vs original: renamed from the misleading `polyMSE` (this
# fits a spline, not a polynomial) and the mean is returned directly rather
# than through a trailing assignment (which made the result invisible).
splineMSE <- function(n) {
tenFold_models <- map(tenFold_data$train, ~glm(Outstate ~ bs(Apps, df = n),
data = .))
tenFold_mse <- map2_dbl(tenFold_models, tenFold_data$test, mse)
mean(tenFold_mse)
}
tenFoldDF <- data.frame(index = 1:10)
tenFoldDF$mse <- unlist(lapply(1:10, splineMSE))
ggplot(tenFoldDF, aes(index, mse)) + geom_line() + geom_point() + labs(title = "MSE vs number of knots",
x = "Number of knots", y = "MSE")
We see the minimum is at 8, but that 4 is almost as low so we will try that.
# Fit the df = 4 cubic B-spline in Apps picked from the CV plot above.
spline4_Apps <- glm(Outstate ~ bs(Apps, df = 4), data = data)
summary(spline4_Apps)
##
## Call:
## glm(formula = Outstate ~ bs(Apps, df = 4), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8298 -3135 -143 2491 11882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7858 705 11.14 < 2e-16 ***
## bs(Apps, df = 4)1 3026 881 3.44 0.00062 ***
## bs(Apps, df = 4)2 1360 2029 0.67 0.50308
## bs(Apps, df = 4)3 8809 6435 1.37 0.17140
## bs(Apps, df = 4)4 -339 4042 -0.08 0.93314
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 15934896)
##
## Null deviance: 1.2559e+10 on 776 degrees of freedom
## Residual deviance: 1.2302e+10 on 772 degrees of freedom
## AIC: 15098
##
## Number of Fisher Scoring iterations: 2
Still not that good, lets see the plot
# Residuals vs predicted values for the Apps spline model.
# Fix: the original reused the name `expendDF` and labelled the plot
# "3rd order polynomial model regression for Expend" — this is actually the
# df = 4 spline in Apps, so the name and labels now say so. The response
# being predicted is out-of-state tuition, not expenditure.
appsDF <- add_predictions(data, spline4_Apps)
appsDF <- add_residuals(appsDF, spline4_Apps)
ggplot(appsDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() +
labs(title = "Spline model regression for Apps", x = "Predicted out-of-state tuition",
y = "Residuals")
Looks like this is still a bad fit and in fact using any kind of polynomial based fit is not good for this data and it likely has very little predictive power regardless.
# Part 3: reload the college data, make a 70/30 split, and fit a
# six-predictor linear model on the training set.
targetFile <- "data/college.csv"
data <- read.csv(targetFile)
data_split <- resample_partition(data, c(test = 0.3, train = 0.7))
lin_college <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend +
Grad.Rate, data = data_split$train)
summary(lin_college)
##
## Call:
## lm(formula = Outstate ~ Private + Room.Board + PhD + perc.alumni +
## Expend + Grad.Rate, data = data_split$train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7851 -1298 -102 1282 10644
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3812.153 523.671 -7.28 1.2e-12 ***
## PrivateYes 2736.038 254.142 10.77 < 2e-16 ***
## Room.Board 1.066 0.104 10.20 < 2e-16 ***
## PhD 40.068 6.694 5.99 3.9e-09 ***
## perc.alumni 48.387 9.004 5.37 1.2e-07 ***
## Expend 0.204 0.020 10.20 < 2e-16 ***
## Grad.Rate 25.980 6.715 3.87 0.00012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2070 on 537 degrees of freedom
## Multiple R-squared: 0.751, Adjusted R-squared: 0.749
## F-statistic: 271 on 6 and 537 DF, p-value: <2e-16
The summary shows that all the coefficients are significant and that together their \(R^2\) indicates \(75\%\) of the variance is accounted for. We can look at the residuals plot to see where the remaining \(25\%\) may be coming from.
# Residuals vs predicted values for the linear model, evaluated on the full
# data set.
fullDF <- add_predictions(data, lin_college)
fullDF <- add_residuals(fullDF, lin_college)
ggplot(fullDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() + labs(title = "Residuals vs predicted values for the full linear fit",
x = "Predicted expenditure", y = "Residuals")
The error appears to be largest for large predicted values; let's see if a GAM can improve this. We will use the sum of loess fits for our continuous variables, since the loess fit is a nice smooth fit and should work with most distributions.
# GAM: loess smooths (lo) for the five continuous predictors, Private as a
# parametric term; fit on the training split.
college_gam <- gam(Outstate ~ lo(perc.alumni) + lo(PhD) + lo(Expend) + lo(Grad.Rate) +
lo(Room.Board) + Private, data = data_split$train)
summary(college_gam)
##
## Call: gam(formula = Outstate ~ lo(perc.alumni) + lo(PhD) + lo(Expend) +
## lo(Grad.Rate) + lo(Room.Board) + Private, data = data_split$train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6904.3 -1138.8 33.4 1189.9 7157.0
##
## (Dispersion Parameter for gaussian family taken to be 3361360)
##
## Null Deviance: 9.22e+09 on 543 degrees of freedom
## Residual Deviance: 1.76e+09 on 522 degrees of freedom
## AIC: 9742
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## lo(perc.alumni) 1 2.32e+09 2.32e+09 689 <2e-16 ***
## lo(PhD) 1 5.23e+08 5.23e+08 156 <2e-16 ***
## lo(Expend) 1 1.96e+09 1.96e+09 583 <2e-16 ***
## lo(Grad.Rate) 1 3.79e+08 3.79e+08 113 <2e-16 ***
## lo(Room.Board) 1 4.88e+08 4.88e+08 145 <2e-16 ***
## Private 1 3.88e+08 3.88e+08 115 <2e-16 ***
## Residuals 522 1.76e+09 3.36e+06
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## lo(perc.alumni) 2.4 4.22 0.0108 *
## lo(PhD) 2.7 2.70 0.0507 .
## lo(Expend) 4.1 29.90 <2e-16 ***
## lo(Grad.Rate) 2.7 2.35 0.0787 .
## lo(Room.Board) 2.9 4.16 0.0069 **
## Private
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Even better \(R^2\) values lets look at the residuals plot
# Residuals vs predicted values for the GAM, evaluated on the full data set.
# Fix: the original title said "full linear fit" — this plot is for the GAM —
# and the x label said "expenditure" although the model predicts tuition.
gamDF <- add_predictions(data, college_gam)
gamDF <- add_residuals(gamDF, college_gam)
ggplot(gamDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() + labs(title = "Residuals vs predicted values for the GAM fit",
x = "Predicted out-of-state tuition", y = "Residuals")
Much less edge effect, it looks like the GAM model has much lower error. Lets look at a few of the individual components:
# Extract per-term fitted curves (with standard errors) from the GAM, then
# plot the perc.alumni smooth with a 95% pointwise band (+/- 1.96 * SE).
college_gam_terms <- preplot(college_gam, se = TRUE, rug = FALSE)
data_frame(x = college_gam_terms$`lo(perc.alumni)`$x, y = college_gam_terms$`lo(perc.alumni)`$y,
se.fit = college_gam_terms$`lo(perc.alumni)`$se.y) %>% mutate(y_low = y -
1.96 * se.fit, y_high = y + 1.96 * se.fit) %>% ggplot(aes(x, y)) + geom_line() +
geom_line(aes(y = y_low), linetype = 2) + geom_line(aes(y = y_high), linetype = 2) +
labs(title = "GAM of out-of-state tuition", x = "Percentage of donating alumni",
y = expression(f[1](perc.alumni)))
We can see that higher rates of alumni donations tend to increase out-of-state tuition.
# The PhD smooth, with the same 95% pointwise band.
data_frame(x = college_gam_terms$`lo(PhD)`$x, y = college_gam_terms$`lo(PhD)`$y,
se.fit = college_gam_terms$`lo(PhD)`$se.y) %>% mutate(y_low = y - 1.96 *
se.fit, y_high = y + 1.96 * se.fit) %>% ggplot(aes(x, y)) + geom_line() +
geom_line(aes(y = y_low), linetype = 2) + geom_line(aes(y = y_high), linetype = 2) +
labs(title = "GAM of out-of-state tuition", x = "Percentage of faculty with PhDs",
y = expression(f[1](PhD)))
We can see that higher rates of faculty with PhDs tend to increase out-of-state tuition in general, but there is a local maximum around \(30\%\), even when extended to the \(95\%\) confidence interval.
# Error-bar plot of the GAM's parametric Private term (public vs private),
# with a 95% interval. Fix: the y-axis label said f[3](gender) — a
# copy/paste slip; this term is Private, not gender.
data_frame(x = college_gam_terms$Private$x, y = college_gam_terms$Private$y,
se.fit = college_gam_terms$Private$se.y) %>% unique %>% mutate(y_low = y -
1.96 * se.fit, y_high = y + 1.96 * se.fit, x = factor(x, levels = c("No",
"Yes"), labels = c("Public", "Private"))) %>% ggplot(aes(x, y, ymin = y_low,
ymax = y_high)) + geom_errorbar() + geom_point() + labs(title = "GAM of out-of-state tuition",
x = NULL, y = expression(f[3](Private)))
We can see that being private has a statistically significant and large effect on out-of-state tuition.
# Held-out test-set MSE for the linear model and the GAM.
lin_mse <- mse(lin_college, data_split$test)
gam_mse <- mse(college_gam, data_split$test)
The MSE for the linear model is \(3.848 \times 10^6\), while the GAM's MSE (`gam_mse`) is only about \(0.3\%\) lower. This is not that large of a difference; going to the much more complicated GAM yields only a small improvement. This is likely due to the GAM being overfitted — using polynomials or other less complicated components in the GAM than loess fits could likely improve the MSE by reducing the level of overfitting.
# Re-print the GAM summary; the nonparametric ANOVA section flags which
# terms appear non-linear.
summary(college_gam)
##
## Call: gam(formula = Outstate ~ lo(perc.alumni) + lo(PhD) + lo(Expend) +
## lo(Grad.Rate) + lo(Room.Board) + Private, data = data_split$train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6904.3 -1138.8 33.4 1189.9 7157.0
##
## (Dispersion Parameter for gaussian family taken to be 3361360)
##
## Null Deviance: 9.22e+09 on 543 degrees of freedom
## Residual Deviance: 1.76e+09 on 522 degrees of freedom
## AIC: 9742
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## lo(perc.alumni) 1 2.32e+09 2.32e+09 689 <2e-16 ***
## lo(PhD) 1 5.23e+08 5.23e+08 156 <2e-16 ***
## lo(Expend) 1 1.96e+09 1.96e+09 583 <2e-16 ***
## lo(Grad.Rate) 1 3.79e+08 3.79e+08 113 <2e-16 ***
## lo(Room.Board) 1 4.88e+08 4.88e+08 145 <2e-16 ***
## Private 1 3.88e+08 3.88e+08 115 <2e-16 ***
## Residuals 522 1.76e+09 3.36e+06
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## lo(perc.alumni) 2.4 4.22 0.0108 *
## lo(PhD) 2.7 2.70 0.0507 .
## lo(Expend) 4.1 29.90 <2e-16 ***
## lo(Grad.Rate) 2.7 2.35 0.0787 .
## lo(Room.Board) 2.9 4.16 0.0069 **
## Private
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA test indicates that Expend is with high likelihood non-linear, while Room.Board is also quite possibly non-linear. Let's look at the plot of Expend.
# The Expend smooth, with the 95% pointwise band.
data_frame(x = college_gam_terms$`lo(Expend)`$x, y = college_gam_terms$`lo(Expend)`$y,
se.fit = college_gam_terms$`lo(Expend)`$se.y) %>% mutate(y_low = y - 1.96 *
se.fit, y_high = y + 1.96 * se.fit) %>% ggplot(aes(x, y)) + geom_line() +
geom_line(aes(y = y_low), linetype = 2) + geom_line(aes(y = y_high), linetype = 2) +
labs(title = "GAM of out-of-state tuition", x = "Expenditure per student",
y = expression(f[1](Expend)))
As you can see from the plot, the effect of expenditure per student is non-linear; we also saw this affecting our ability to use it in part 2.